# Term-project notebook (exported to .py): predict the "Target" column of a
# marketing dataset. Loads train/test CSVs from Google Drive and inspects them.
import pandas as pd
import numpy as np
import seaborn as sns
np.random.seed(44)  # fix the NumPy RNG so any sampling/plots are reproducible
from google.colab import drive
drive.mount("/content/gdrive")
train = pd.read_csv('/content/gdrive/My Drive/3253 Machine Learning Term Project/train.csv')
test = pd.read_csv('/content/gdrive/My Drive/3253 Machine Learning Term Project/test.csv')
print(train.shape)
train.shape
train.info()
#count the unique values in "Target", which is the prediction we are going to make
train["Target"].value_counts()
#Check if the data is balanced
%matplotlib inline
import matplotlib.pyplot as plt
def check_data_balance(series, style="seaborn-pastel"):
    """Show the class counts of *series* and plot them as a pie chart.

    Used to eyeball whether the target classes are balanced. `display`
    is the notebook-provided rich-output helper.
    """
    with plt.style.context(style):
        counts = series.value_counts()
        display(counts)  # show unique value counts of the target
        # slight explode on every wedge so small classes remain visible
        plt.pie(counts, explode=[0.05] * len(counts), labels=counts.index, autopct='%1.1f%%')
check_data_balance(train["Target"])
#inspect the data using the dataframe's describe() function
train.describe()
%matplotlib inline
import matplotlib.pyplot as plt
#draw histogram of each feature
train.hist(bins=50, figsize=(20,15))
#save_fig("attribute_histogram_plots")
plt.show()
# Interactive scatter matrices, up to 8 features at a time, to eyeball
# pairwise relationships (column 0 is skipped: it is the customer id).
import plotly.express as px
fig = px.scatter_matrix(train.iloc[:,1:8], height=1000)
fig.show()
fig = px.scatter_matrix(train.iloc[:,8:16], height=1000)
fig.show()
fig = px.scatter_matrix(train.iloc[:,16:24], height=1000)
fig.show()
fig = px.scatter_matrix(train.iloc[:,24:32], height=1000)
fig.show()
fig = px.scatter_matrix(train.iloc[:,32:37], height=1000)
fig.show()
len(train.columns)  # total number of columns in the training set
# Check PreviousCampaignResult feature values - almost all of them are zero
train['PreviousCampaignResult'].value_counts()
# Check values for Product features
products = train[['Product1', 'Product2', 'Product3', 'Product4', 'Product5', 'Product6']]
products.apply(pd.Series.value_counts)
# Products value counts in % contribution
# FIX: the module-level pd.value_counts() is deprecated (removed in pandas 2.x);
# call the Series method instead — same output.
products.apply(lambda x: x.value_counts(normalize=True).mul(100).round(1).astype(str) + '%')
# Box plots for Transaction features: one subplot per Transaction1..Transaction9.
# FIX: replaces nine copy-pasted subplot/boxplot pairs with a single loop;
# plt.subplot(3, 3, i) is identical to the original plt.subplot(33i) form.
for i in range(1, 10):
    plt.subplot(3, 3, i)
    sns.boxplot(train[f"Transaction{i}"])
fig = plt.gcf()
fig.set_size_inches(10,10)
# Count how many rows carry a non-zero value in each Transaction column.
transaction_cols = [f'Transaction{i}' for i in range(1, 10)]
transactions = train[transaction_cols]
# column-wise non-zero tally (same values as applying np.count_nonzero per column)
transactions.ne(0).sum()
# Check values for External Account features (as % of rows per value)
external_accounts = train[['ExternalAccount1', 'ExternalAccount2', 'ExternalAccount3', 'ExternalAccount4', 'ExternalAccount5', 'ExternalAccount6', 'ExternalAccount7']]
# FIX: pd.value_counts() is deprecated (removed in pandas 2.x); use the
# Series method — same output.
external_accounts.apply(lambda x: x.value_counts(normalize=True).mul(100).round(1).astype(str) + '%')
# Check values for Activity Indicator feature
sns.boxplot(train['ActivityIndicator'])
# Group values into buckets by percentile
p50 = np.percentile(train["ActivityIndicator"],50)
p75 = np.percentile(train["ActivityIndicator"],75)
p99 = np.percentile(train["ActivityIndicator"],99)
# the extra 0 edge isolates fully inactive customers into their own (-1, 0] bucket
bins = [-1, 0, p50, p75, p99, np.inf]
train["ActivityIndicator"].value_counts(bins=bins, sort=False, normalize=True).mul(100).round(1).astype(str) + '%'
# Check target variable for customers with no activity
inactive = train[train['ActivityIndicator'] == 0]
inactive["Target"].value_counts()
inactive["Target"].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
# Check target variable for customers with some activity
active = train[train['ActivityIndicator'] != 0]
active["Target"].value_counts()
active["Target"].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
# Check values for Regular Interaction Indicator feature
sns.countplot(train["RegularInteractionIndicator"])
# Group values into buckets by percentile
p50 = np.percentile(train["RegularInteractionIndicator"],50)
p75 = np.percentile(train["RegularInteractionIndicator"],75)
p99 = np.percentile(train["RegularInteractionIndicator"],99)
bins = [-1, p50, p75, p99, np.inf]
# percentage of customers falling into each percentile bucket
train["RegularInteractionIndicator"].value_counts(bins=bins, sort=False, normalize=True).mul(100).round(1).astype(str) + '%'
# Check target variable for customers with zero interaction frequency score
infrequent = train[train['RegularInteractionIndicator'] == 0]
infrequent["Target"].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
# Summarize rate offers (as % of rows per value)
# FIX: the original listed 'CompetitiveRate4' twice and omitted
# 'CompetitiveRate3', so CompetitiveRate3 was never summarized.
rates = train[['CompetitiveRate1', 'CompetitiveRate2', 'CompetitiveRate3', 'CompetitiveRate4', 'CompetitiveRate5', 'CompetitiveRate6', 'CompetitiveRate7',
               'RateBefore', 'ReferenceRate']]
# pd.value_counts() is deprecated; use the Series method instead.
rates.apply(lambda x: x.value_counts(normalize=True).mul(100).round(1).astype(str) + '%')
# Check if there's correlation between balance and activity/interaction indicators
train_corr = train[['Balance', 'ActivityIndicator', 'RegularInteractionIndicator']].copy()
# sns.pairplot(train_corr)
# annotated correlation heatmap, diverging palette centered on 0
sns.heatmap(train_corr.corr(),
annot=True,
linewidth=.5,
center = 0,
fmt='.1g',
cbar=False,
cmap='GnBu')
Observations:
# Split features from the target; index feature rows by customer id.
X = train.drop(columns=['Target']).set_index('Customer_id')
y = train[['Target']]
Use the 1.5×IQR rule to remove outliers: anything above Q3 + 1.5×IQR is a high outlier, and anything below Q1 − 1.5×IQR is a low outlier.
High and low outliers will be replaced by the median of the Balance feature.
X['Balance'].quantile([0.25,0.5,0.75])
# Tukey fences for the Balance feature
q1 = X['Balance'].quantile(0.25)
q3 = X['Balance'].quantile(0.75)
IQR = q3 - q1
low_outliers = q1 - 1.5* IQR
high_outliers = q3 + 1.5*IQR
# Replace both outlier tails with the Balance median.
# FIX: the original recomputed the column median inside a per-row lambda
# (quadratic); hoist it once and use a vectorized mask. (Replacing values
# beyond the fences with the median does not move the median itself, so
# applying both tails in one pass matches the original two-pass result.)
balance_median = X['Balance'].median()
outlier_mask = (X['Balance'] > high_outliers) | (X['Balance'] < low_outliers)
X.loc[outlier_mask, 'Balance'] = balance_median
from google.colab import files
X[['Balance']].boxplot().get_figure().savefig('outlier_balance.png')
files.download('outlier_balance.png')
Isolate the transaction features (Transaction1–Transaction9) per customer.
X_transactions = X.loc[:,'Transaction1':'Transaction9']
# Use DBSCAN to flag outliers across the transaction features.
from sklearn.cluster import DBSCAN
# FIX: the original rebound the name DBSCAN to the fitted instance, shadowing
# the class; use a lowercase variable instead.
dbscan = DBSCAN(eps=2000, min_samples=4, n_jobs=-1).fit(X_transactions)
# Inspect how many instances fall into each cluster. The spread is wide enough
# that many clusters appear; points labelled -1 are noise and are treated as
# outliers below.
import sys
# pd.set_option("display.max_rows", None, "display.max_columns", None)
np.set_printoptions(threshold=sys.maxsize)
labels = dbscan.labels_
labels_df = pd.DataFrame(labels, columns=['cluster'])
labels_df['cluster'].value_counts().sort_index()
# Rows labelled -1 (noise) are the outliers.
X_transactions.iloc[labels_df[labels_df['cluster'] == -1].index, :]
# Separate the outlier row indices from the non-outlier rows.
outliers_index = X_transactions.iloc[labels_df[labels_df['cluster'] == -1].index, :].index
# FIX: the original used `~labels_df['cluster'] == -1`; since ~ binds tighter
# than ==, that expression selects only cluster 0 and silently drops every
# other non-noise cluster. The intended condition is cluster != -1.
good_index = X_transactions.iloc[labels_df[labels_df['cluster'] != -1].index, :].index
# For the noise rows found by DBSCAN, replace each transaction value at or
# above that column's 25th percentile (computed over the noise rows) with the
# median of that column computed over the non-outlier rows.
good_transactions_median = X.loc[good_index.tolist(), 'Transaction1':'Transaction9'].median()
outlier_rows = outliers_index.tolist()
outliers = X.loc[outlier_rows, 'Transaction1':'Transaction9'].quantile([0.25])
# FIX: one loop replaces nine copy-pasted per-column statements and hoists the
# repeated outliers_index.tolist() calls. Defaults bind the per-column median
# and threshold eagerly (avoids the late-binding closure pitfall).
for col in [f'Transaction{i}' for i in range(1, 10)]:
    X.loc[outlier_rows, col] = X.loc[outlier_rows, col].apply(
        lambda x, m=good_transactions_median[col], t=outliers[col][0.25]: m if x >= t else x)
Define the CombinedAttributesAdder transformer, which engineers the additional features used by the pipeline.
from sklearn.base import BaseEstimator, TransformerMixin
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Feature-engineering transformer for the campaign dataset.

    Appends derived columns (rate delta, product/account counts, per-customer
    transaction and competitor-rate statistics, interaction terms) to X and
    returns the result as a NumPy array.

    Parameters
    ----------
    remove_cols : list of str or None
        Columns to drop after the engineered features are appended. When
        falsy (None or []), nothing is dropped and the final column names are
        recorded in the module-level ``cols`` so the random-forest
        feature-importance chart can label its axes.
    """

    def __init__(self, remove_cols=None):  # no *args or **kargs
        # FIX: the original only assigned the attribute when remove_cols was
        # not None, so transform() raised AttributeError under the default.
        self.remove_cols = remove_cols

    def fit(self, X, y=None):
        return self  # stateless transformer: nothing to learn

    def transform(self, X):
        trans = X.loc[:, 'Transaction1':'Transaction9']
        comp_rates = X.loc[:, 'CompetitiveRate1':'CompetitiveRate7']
        # % change from the previous rate to the current reference rate
        diff_interest_rate = X[['RateBefore', 'ReferenceRate']].pct_change(axis=1)['ReferenceRate'].rename('diff_interest_rate')
        # number of products / external accounts per customer
        count_product = X.loc[:, 'Product1':'Product6'].sum(axis=1)
        count_external_acct = X.loc[:, 'ExternalAccount1':'ExternalAccount7'].sum(axis=1)
        # how many of the 9 transaction slots are positive
        # (replaces the deprecated DataFrame.applymap with a vectorized test)
        count_transactions = trans.gt(0).sum(axis=1)
        # per-customer (row-wise) transaction statistics
        ave_transactions = trans.mean(axis=1)
        max_transactions = trans.max(axis=1)
        min_transactions = trans.min(axis=1)
        median_transactions = trans.median(axis=1)
        std_transactions = trans.std(axis=1)
        sum_transactions = trans.sum(axis=1)
        q1_transactions = trans.quantile(.25, axis=1)
        q2_transactions = trans.quantile(.5, axis=1)
        q3_transactions = trans.quantile(.75, axis=1)
        IQR_transactions = q3_transactions - q1_transactions
        # Tukey fences of the per-customer transaction distribution
        low_outliers_transactions = q1_transactions - 1.5 * IQR_transactions
        high_outliers_transactions = q3_transactions + 1.5 * IQR_transactions
        # per-customer competitor-rate statistics
        ave_rate = comp_rates.mean(axis=1)
        min_rate = comp_rates.min(axis=1)
        median_rate = comp_rates.median(axis=1)
        std_rate = comp_rates.std(axis=1)
        # interaction features
        new_count_product = count_product * X['PreviousCampaignResult']
        # potential interest savings ($) the customer gained from the new rate
        interest_savings = (X['RateBefore'] - X['ReferenceRate']) * sum_transactions
        prod_to_ext_acct = count_product.divide(count_external_acct, fill_value=0).fillna(0).replace(np.inf, 0)
        newreference_prodcount = (X['ReferenceRate'] * count_product).fillna(0)
        newreference_sumtransactions = X['ReferenceRate'] * sum_transactions
        regact_diffrate = X['ActivityIndicator'] * diff_interest_rate
        balance_diffrate = X['Balance'] * diff_interest_rate
        # NOTE: the original also computed several series it never concatenated
        # (new_act_indicator, new_reg_act_indicator, ave_transactions_diffrate,
        # max_rate); they are dropped here. One of them indexed
        # 'RegularInteractionIndicator ' with a trailing space — presumably a
        # typo for 'RegularInteractionIndicator' (the spelling used everywhere
        # else) — which would have raised a KeyError if ever kept.
        engineered = [
            diff_interest_rate.rename('test_diff_interest_rate'),
            count_external_acct.rename('count_external_acct'),
            count_transactions.rename('count_transactions'),
            ave_transactions.rename('ave_transactions'),
            max_transactions.rename('max_transactions'),
            min_transactions.rename('min_transactions'),
            median_transactions.rename('median_transactions'),
            std_transactions.rename('std_transactions'),
            ave_rate.rename('ave_rate'),
            min_rate.rename('min_rate'),
            median_rate.rename('median_rate'),
            std_rate.rename('std_rate'),
            new_count_product.rename('new_count_product'),
            interest_savings.rename('interest_savings'),
            prod_to_ext_acct.rename('prod_to_ext_acct'),
            newreference_prodcount.rename('newreference_prodcount'),
            newreference_sumtransactions.rename('newreference_sumtransactions'),
            regact_diffrate.rename('regact_diffrate'),
            balance_diffrate.rename('balance_diffrate'),
            q1_transactions.rename('q1_transactions'),
            q2_transactions.rename('q2_transactions'),
            q3_transactions.rename('q3_transactions'),
            IQR_transactions.rename('IQR_transactions'),
            low_outliers_transactions.rename('low_outliers_transactions'),
            high_outliers_transactions.rename('high_outliers_transactions'),
        ]
        # Both original branches concatenated the identical list; build it once.
        X = pd.concat([X] + engineered, axis=1)
        if self.remove_cols:  # optional, driven by the hyperparameter
            X = X.drop(columns=self.remove_cols)
            print("Shape after: ", X.shape)
            return X.values
        # expose the final column names for the feature-importance chart
        global cols
        cols = X.columns
        return X.values
# Preprocessing pipeline: engineer the combined attributes (remove_cols=[]
# keeps every column), then scale all features to [0, 1].
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
num_pipeline = Pipeline([
('attribs_adder', CombinedAttributesAdder(remove_cols=[])),
('minmax_scaler', MinMaxScaler()),
])
X = num_pipeline.fit_transform(X)  # now a NumPy array
y = y.values.ravel()  # flatten the (n, 1) target frame to shape (n,)
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
# Refer to
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
# to identify correct scoring. there are variations of f1 score
## GRADIENT SEARCH TAKING UP TOO MUCH TIME
# Random-forest hyperparameter grid; GridSearchCV (refit=True by default)
# refits the best configuration on the whole training split after the search.
param_grid = [{ 'n_estimators': np.arange(180, 250, 5).tolist(),
"max_depth": np.arange(2, 15, 4).tolist(),
"min_samples_leaf": [1,2,5,10],
"max_features": [5,10, 20],
"bootstrap": [True, False]}]
RF = RandomForestClassifier()
rf_clf = GridSearchCV(RF, param_grid, cv=5 ,scoring='f1')
rf_clf.fit(X_train, y_train)
Pickle the model for future use
# Pickle the fitted grid-search model so it can be reloaded later.
from joblib import dump, load
import pickle
from google.colab import files
import datetime

timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H")
# FIX: the original leaked the file handle via pickle.dump(obj, open(...));
# a with-statement guarantees the handle is closed after writing.
with open(f'rf_clf_{timestamp}.joblib', 'wb') as fh:
    pickle.dump(rf_clf, fh)
files.download(f'rf_clf_{timestamp}.joblib')
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
# Evaluate the tuned random forest on both the train and validation splits.
yhat_train = rf_clf.predict(X_train)
yhat_valid= rf_clf.predict(X_valid)
# ravel() unpacks the 2x2 confusion matrix as (tn, fp, fn, tp)
tn_train, fp_train, fn_train, tp_train = confusion_matrix(y_train, yhat_train).ravel()
tn_valid, fp_valid, fn_valid, tp_valid = confusion_matrix(y_valid, yhat_valid).ravel()
print("True Negative: ", tn_train, " | False positive: ", fp_train,
" | False negative: ",fn_train, " | True positive: ",tp_train)
print("Recall score: ", recall_score(y_train, yhat_train ))
print("Precision score: ", precision_score(y_train, yhat_train))
print("")
print("True Negative: ", tn_valid, " | False positive: ", fp_valid,
" | False negative: ",fn_valid, " | True positive: ",tp_valid)
print("Recall score: ", recall_score(y_valid, yhat_valid ))
print("Precision score: ", precision_score(y_valid, yhat_valid))
print("Train f1 Score: ", f1_score(y_train, yhat_train))
print("Validation f1 Score: ", f1_score(y_valid, yhat_valid))
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
# Manual stratified cross-validation of the tuned model.
# FIX: modern scikit-learn raises ValueError when random_state is set while
# shuffle is False; enable shuffling so the seed actually takes effect.
skfolds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
for train_index, valid_index in skfolds.split(X, y):
    clone_clf = clone(rf_clf)  # fresh unfitted copy for each fold
    X_train_folds = X[train_index]
    y_train_folds = y[train_index]
    X_valid_fold = X[valid_index]
    y_valid_fold = y[valid_index]
    clone_clf.fit(X_train_folds, y_train_folds)
    yhat_train = clone_clf.predict(X_train_folds)
    print('Train Accuracy: ', accuracy_score(y_train_folds, yhat_train))
    print("Train f1 Score: ", f1_score(y_train_folds, yhat_train))
    yhat_valid = clone_clf.predict(X_valid_fold)
    # (the original also computed an unused n_correct tally; removed)
    print('Validation Accuracy: ', accuracy_score(y_valid_fold, yhat_valid))
    print("Validation f1 Score: ", f1_score(y_valid_fold, yhat_valid))
    print("")
Create dictionary to change feature indices to feature names
col_dict = dict(zip(range(0, len(cols)), cols))
Plot each feature's relative importance.
import matplotlib.pyplot as plt
# FIX: rf_clf is a GridSearchCV, which does not expose feature_importances_
# or estimators_; read both from the refitted best estimator instead
# (available because GridSearchCV defaults to refit=True).
best_rf = rf_clf.best_estimator_
importances = best_rf.feature_importances_
# per-feature spread of importances across the forest's trees
std = np.std([tree.feature_importances_ for tree in best_rf.estimators_],
             axis=0)
indices = np.argsort(importances)
# Plot the feature importances of the forest
plt.figure(figsize=(80,80))
plt.rcParams.update({'font.size': 40})
plt.title("Feature importances")
plt.barh(range(X.shape[1]), (importances[indices]) ,
color="r", xerr=std[indices], align="center")
# Label each bar with its feature name via col_dict.
plt.yticks(range(X.shape[1]), [col_dict[k] for k in indices])
plt.ylim([-1, X.shape[1]])
plt.show()
# Tune k for K-nearest-neighbours with a small grid search (f1 scoring).
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
KNN = KNeighborsClassifier()
param_grid = [{'n_neighbors': [180,200,220,250]}]
grid_search_KNN = GridSearchCV(KNN, param_grid, cv=5 ,scoring='f1')
grid_search_KNN.fit(X_train, y_train)
grid_search_KNN.best_params_
knn_clf = grid_search_KNN.best_estimator_
# Persist the best KNN model.
from joblib import dump, load
import pickle
# FIX: the original leaked the file handle via pickle.dump(obj, open(...));
# use a with-statement so the handle is closed.
with open("knn_clf_no_all_outliers.joblib", 'wb') as fh:
    pickle.dump(knn_clf, fh)
from google.colab import files
files.download('knn_clf_no_all_outliers.joblib')
# Evaluate the tuned KNN model on both splits (same report as for the forest).
yhat_train = knn_clf.predict(X_train)
yhat_valid= knn_clf.predict(X_valid)
# ravel() unpacks the 2x2 confusion matrix as (tn, fp, fn, tp)
tn_train, fp_train, fn_train, tp_train = confusion_matrix(y_train, yhat_train).ravel()
tn_valid, fp_valid, fn_valid, tp_valid = confusion_matrix(y_valid, yhat_valid).ravel()
print("True Negative: ", tn_train, " | False positive: ", fp_train,
" | False negative: ",fn_train, " | True positive: ",tp_train)
print("Recall score: ", recall_score(y_train, yhat_train ))
print("Precision score: ", precision_score(y_train, yhat_train))
print("")
print("True Negative: ", tn_valid, " | False positive: ", fp_valid,
" | False negative: ",fn_valid, " | True positive: ",tp_valid)
print("Recall score: ", recall_score(y_valid, yhat_valid ))
print("Precision score: ", precision_score(y_valid, yhat_valid))
from sklearn.ensemble import GradientBoostingClassifier
# Sweep learning rates; report train/validation accuracy for each setting.
# The last fitted model (lr=0.35) is reused by the staged-prediction cell below.
for lr in [0.025, 0.05, 0.1, 0.25, 0.35]:
    gb = GradientBoostingClassifier(n_estimators=200, learning_rate=lr, max_features=5, max_depth=8, random_state=0)
    gb.fit(X_train, y_train)
    print("Learning rate: ", lr)
    print("Accuracy score (training): {0:.3f}".format(gb.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb.score(X_valid, y_valid)))
from sklearn.metrics import mean_squared_error
from joblib import dump, load
from sklearn.ensemble import GradientBoostingClassifier
# Score every staged prediction of the last fitted GB model on the validation
# set and keep the stage count that minimises the error (assuming 0/1 labels,
# this MSE equals the misclassification rate — TODO confirm label encoding).
errors = [mean_squared_error(y_valid, y_pred)
for y_pred in gb.staged_predict(X_valid)]
bst_n_estimators = np.argmin(errors) + 1  # stages are 1-indexed
gb_clf = GradientBoostingClassifier(max_depth=2, n_estimators=bst_n_estimators, random_state=42) # refit a shallow GB with the selected stage count
gb_clf.fit(X_train, y_train)
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
# Evaluate the refitted gradient-boosting model on both splits.
yhat_train = gb_clf.predict(X_train)
yhat_valid= gb_clf.predict(X_valid)
confusion_matrix(y_train, yhat_train)
precision = precision_score(y_train, yhat_train)
recall =recall_score(y_train, yhat_train)
print("Train Precision: ", precision)
print("Train Recall: ", recall)
print("Train f1 Score: ", f1_score(y_train, yhat_train))
gb_clf.get_params  # NOTE(review): missing () — this displays the bound method, not the params
yhat_valid_precision = precision_score(y_valid, yhat_valid)
yhat_valid_recall = recall_score(y_valid, yhat_valid)
print("Validation Precision: ", yhat_valid_precision)
print("Validation Recall: ", yhat_valid_recall)
print("Validation f1 Score: ", f1_score(y_valid, yhat_valid))
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from joblib import dump, load

# Soft-voting ensemble over the individually tuned models.
svm_clf = SVC(gamma="scale", random_state=42, probability=True)
ensemble_members = [('rf', rf_clf), ('svc', svm_clf), ('knn', knn_clf), ('gb', gb_clf)]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')
voting_clf.fit(X_train, y_train)
from sklearn.metrics import f1_score
# Fit and score each individual model and the ensemble on the validation split.
# (yhat_valid from the final iteration — the ensemble — is reused below.)
for model in (rf_clf, svm_clf, knn_clf, voting_clf):
    print(model)
    model.fit(X_train, y_train)
    yhat_valid = model.predict(X_valid)
    print(model.__class__.__name__, f1_score(y_valid, yhat_valid))
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
# yhat_valid = voting_clf.predict(X_valid)
# NOTE: yhat_valid carries over from the comparison loop above, whose last
# iteration was voting_clf — so these are the ensemble's validation metrics.
yhat_valid_precision = precision_score(y_valid, yhat_valid)
yhat_valid_recall = recall_score(y_valid, yhat_valid)
print("Validation Precision: ", yhat_valid_precision)
print("Validation Recall: ", yhat_valid_recall)
print("Validation f1 Score: ", f1_score(y_valid, yhat_valid))
# Prepare the held-out test set and write the submission file.
test = test.set_index('Customer_id')
# FIX: use transform(), not fit_transform(). Refitting the pipeline here would
# re-learn the MinMaxScaler's min/max from the test set, scaling it differently
# from the data the model was trained on.
prepared_test = num_pipeline.transform(test)
final_predictions = rf_clf.predict(prepared_test)
pd.concat([pd.DataFrame(test.index), pd.DataFrame(final_predictions)], axis=1).to_csv('submission.csv', index=False, header=['Customer_id', 'Target'])
from google.colab import files
files.download('submission.csv')